#' ---
#' title: Reproduce Table 2 (City Name Merge Results)
#' author: Joe Ornstein
#' date: 2025-07-06
#' version: 0.2
#' ---

# Remove every (non-hidden) object from the current environment so the script
# does not pick up leftovers from earlier work. This does not detach packages
# or reset options.
# NOTE(review): clearing the workspace from inside a script is generally
# discouraged -- confirm whether the replication pipeline relies on it.
rm(list = ls())

library(tidyverse)
library(tinytable)

# Print a section banner to the console/log for this application.
cat('\n\n**Application 2: Linking City Names**\n\n')

## Load hand-labeled record pairs from the three merged datasets ---------

# For the two AFSM variants, only record pairs whose `hitl_score` exceeds 0.9
# are considered. "True matches" is the sum of the hand-coded `label` over
# those pairs; precision is the mean of `label` over the same pairs, expressed
# as a percentage rounded to one decimal place.

# AFSM
afsm_labeled <- read_csv(
  'data/cities-merge/afsm_results_no_embeddings_labeled.csv',
  progress = FALSE
)
afsm_true_matches <- with(afsm_labeled, sum(label[hitl_score > 0.9]))
afsm_precision <- with(
  afsm_labeled,
  round(100 * mean(label[hitl_score > 0.9]), 1)
)

# AFSM with embeddings
afsm_embeddings_labeled <- read_csv(
  'data/cities-merge/afsm_results_with_embeddings_labeled.csv',
  progress = FALSE
)
afsm_embeddings_true_matches <- with(
  afsm_embeddings_labeled,
  sum(label[hitl_score > 0.9])
)
afsm_embeddings_precision <- with(
  afsm_embeddings_labeled,
  round(100 * mean(label[hitl_score > 0.9]), 1)
)

# fuzzylink
# The fuzzylink workbook is summarized over all of its rows (no score filter
# is applied here), using the `hand_label` column.
fuzzylink_labeled <- readxl::read_xlsx(
  'data/cities-merge/cities_fuzzylink_hand-coded.xlsx'
)
fuzzylink_true_matches <- with(fuzzylink_labeled, sum(hand_label))
fuzzylink_precision <- with(
  fuzzylink_labeled,
  round(100 * mean(hand_label), 1)
)

## Format Table 2 ---------------------

# One row per algorithm: the number of true matches identified and the
# precision, rendered as a percentage string (e.g. "97.5%").
tb <- tribble(~`Algorithm`, ~`True Matches Identified`, ~`Precision`,
              'AFSM', afsm_true_matches, paste0(afsm_precision, '%'),
              'AFSM with Embeddings', afsm_embeddings_true_matches, paste0(afsm_embeddings_precision, '%'),
              'fuzzylink', fuzzylink_true_matches, paste0(fuzzylink_precision, '%'))

# Make sure the output directory exists before writing, in case it has not
# been created yet (e.g. on a fresh checkout). This is a no-op when the
# directory is already present.
table2_path <- 'tables/table2.tex'
dir.create(dirname(table2_path), showWarnings = FALSE, recursive = TRUE)

# Render the table and write it out as LaTeX, replacing any existing file.
# `escape = TRUE` asks tinytable to escape special characters (such as the
# `%` in the Precision column) for the output format.
tb |>
  tt(digits = 3, caption = 'Performance metrics for city name merge across three algorithms.') |>
  format_tt(escape = TRUE) |>
  save_tt(table2_path, overwrite = TRUE)
